{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### This jupyter notebooks provides the code for classifying ECG signals using the Discrete Wavelet Transform.\n",
    "### To get some more background information, please have a look at the accompanying blog-post:\n",
    "### http://ataspinar.com/2018/12/21/a-guide-for-using-the-wavelet-transform-in-machine-learning/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scipy.io as sio\n",
    "from scipy.fftpack import fft\n",
    "from IPython.display import display\n",
    "\n",
    "import pywt\n",
    "import scipy.stats\n",
    "\n",
    "import datetime as dt\n",
    "from collections import defaultdict, Counter\n",
    "\n",
    "from sklearn.ensemble import GradientBoostingClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_entropy(list_values):\n",
    "    counter_values = Counter(list_values).most_common()\n",
    "    probabilities = [elem[1]/len(list_values) for elem in counter_values]\n",
    "    entropy=scipy.stats.entropy(probabilities)\n",
    "    return entropy\n",
    "\n",
    "def calculate_statistics(list_values):\n",
    "    n5 = np.nanpercentile(list_values, 5)\n",
    "    n25 = np.nanpercentile(list_values, 25)\n",
    "    n75 = np.nanpercentile(list_values, 75)\n",
    "    n95 = np.nanpercentile(list_values, 95)\n",
    "    median = np.nanpercentile(list_values, 50)\n",
    "    mean = np.nanmean(list_values)\n",
    "    std = np.nanstd(list_values)\n",
    "    var = np.nanvar(list_values)\n",
    "    rms = np.nanmean(np.sqrt(list_values**2))\n",
    "    return [n5, n25, n75, n95, median, mean, std, var, rms]\n",
    "\n",
    "def calculate_crossings(list_values):\n",
    "    zero_crossing_indices = np.nonzero(np.diff(np.array(list_values) > 0))[0]\n",
    "    no_zero_crossings = len(zero_crossing_indices)\n",
    "    mean_crossing_indices = np.nonzero(np.diff(np.array(list_values) > np.nanmean(list_values)))[0]\n",
    "    no_mean_crossings = len(mean_crossing_indices)\n",
    "    return [no_zero_crossings, no_mean_crossings]\n",
    "\n",
    "def get_features(list_values):\n",
    "    entropy = calculate_entropy(list_values)\n",
    "    crossings = calculate_crossings(list_values)\n",
    "    statistics = calculate_statistics(list_values)\n",
    "    return [entropy] + crossings + statistics\n",
    "\n",
    "def get_uci_har_features(dataset, labels, waveletname):\n",
    "    uci_har_features = []\n",
    "    for signal_no in range(0, len(dataset)):\n",
    "        features = []\n",
    "        for signal_comp in range(0,dataset.shape[2]):\n",
    "            signal = dataset[signal_no, :, signal_comp]\n",
    "            list_coeff = pywt.wavedec(signal, waveletname)\n",
    "            for coeff in list_coeff:\n",
    "                features += get_features(coeff)\n",
    "        uci_har_features.append(features)\n",
    "    X = np.array(uci_har_features)\n",
    "    Y = np.array(labels)\n",
    "    return X, Y\n",
    "\n",
    "def get_train_test(df, y_col, x_cols, ratio):\n",
    "    \"\"\" \n",
    "    This method transforms a dataframe into a train and test set, for this you need to specify:\n",
    "    1. the ratio train : test (usually 0.7)\n",
    "    2. the column with the Y_values\n",
    "    \"\"\"\n",
    "    mask = np.random.rand(len(df)) < ratio\n",
    "    df_train = df[mask]\n",
    "    df_test = df[~mask]\n",
    "       \n",
    "    Y_train = df_train[y_col].values\n",
    "    Y_test = df_test[y_col].values\n",
    "    X_train = df_train[x_cols].values\n",
    "    X_test = df_test[x_cols].values\n",
    "    return df_train, df_test, X_train, Y_train, X_test, Y_test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Loading the ECG Dataset\n",
    "Download from here: https://github.com/mathworks/physionet_ECG_data/blob/master/ECGData.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = './data/ECG_data/ECGData.mat'\n",
    "ecg_data = sio.loadmat(filename)\n",
    "ecg_signals = ecg_data['ECGData'][0][0][0]\n",
    "ecg_labels_ = ecg_data['ECGData'][0][0][1]\n",
    "ecg_labels = list(map(lambda x: x[0][0], ecg_labels_))\n",
    "\n",
    "dict_ecg_data = defaultdict(list)\n",
    "for ii, label in enumerate(ecg_labels):\n",
    "    dict_ecg_data[label].append(ecg_signals[ii])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Calculating the features per ECG signal\n",
    "## And generating the training and test sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_labels = []\n",
    "list_features = []\n",
    "for k, v in dict_ecg_data.items():\n",
    "    yval = list(dict_ecg_data.keys()).index(k)\n",
    "    for signal in v:\n",
    "        features = []\n",
    "        list_labels.append(yval)\n",
    "        list_coeff = pywt.wavedec(signal, 'sym5')\n",
    "        for coeff in list_coeff:\n",
    "            features += get_features(coeff)\n",
    "        list_features.append(features)\n",
    "df = pd.DataFrame(list_features)\n",
    "ycol = 'y'\n",
    "xcols = list(range(df.shape[1]))\n",
    "df.loc[:,ycol] = list_labels\n",
    "\n",
    "df_train, df_test, X_train, Y_train, X_test, Y_test = get_train_test(df, ycol, xcols, ratio = 0.5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Training the Gradient Boosting Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The Train Score is 1.0\n",
      "The Test Score is 0.90625\n"
     ]
    }
   ],
   "source": [
    "cls = GradientBoostingClassifier(n_estimators=10000)\n",
    "cls.fit(X_train, Y_train)\n",
    "train_score = cls.score(X_train, Y_train)\n",
    "test_score = cls.score(X_test, Y_test)\n",
    "print(\"The Train Score is {}\".format(train_score))\n",
    "print(\"The Test Score is {}\".format(test_score))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
